# Notebook setup: switch to the project directory, import pandas, silence
# warnings, and load the rpy2 extension used by the %%R cells.
%cd /mnt/Data/Jupyter/VisCA1
import pandas as pd
import warnings
# NOTE(review): this suppresses every warning category for the whole session,
# including pandas deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")
%load_ext rpy2.ipython
%%R
# Attach the R packages used by the later %%R cells.
# library() stops immediately when a package is missing; require() only
# returns FALSE, which postpones the failure to an unrelated later call.
library(tidyverse)
library(ggplot2)
library(plotluck)
library(reshape2)
library(cowplot)
# Make the minimal theme the ggplot2 default for the plots that follow.
theme_set(theme_minimal())
# Load the lobbying spreadsheet and add the total spend per record.
lobbying = pd.read_excel("lobbying_data.xlsx")
lobbying["Total-Lobbying"] = lobbying["Compensation"] + lobbying["Expenses"]
# Three-column frame passed to the R tree cell. rename() without inplace
# returns a new frame, so nothing is mutated in place on a column selection
# of `lobbying` (the in-place form is the chained-assignment pattern pandas
# warns about).
lobbyingTree = lobbying[["entity_type", "Entity-cleaned", "Total-Lobbying"]].rename(
    columns={"entity_type": "Type", "Entity-cleaned": "Entity", "Total-Lobbying": "Lobbying"}
)
lobbyingTree.head()
%%R -i lobbyingTree
# library() fails loudly if data.tree is not installed (require() would not).
library(data.tree)
# One path per row: LOBBYING_TREE/<Type>/<Entity>.
lobbyingTree$pathString <- paste("LOBBYING_TREE", lobbyingTree$Type, lobbyingTree$Entity, sep = "/")
# Convert the frame (including pathString) to a data.tree Node; the former
# `lobbyingTree[,]` selected every row and column, i.e. the same frame.
tree <- as.Node(lobbyingTree)
print(tree, pruneMethod = "dist", limit = 20)
%%R
# Store on each node the result of Aggregate(node, "Lobbying", sum).
store_aggregate <- function(node) {
  node$aggLobbying <- Aggregate(node, "Lobbying", sum)
}
tree$Do(store_aggregate)
# Sort the tree recursively by "Lobbying", largest first.
tree$Sort(attribute = "Lobbying", decreasing = TRUE, recursive = TRUE)
# Show the raw and aggregated values side by side for a pruned view.
print(tree, "Lobbying", "aggLobbying", pruneMethod = "dist", limit = 14)
%%R -w 8 -h 5 --units in -r 150
library(treemap)
# Edge list (from/to) of the tree with the "Lobbying" attribute attached.
tree_edges <- ToDataFrameNetwork(tree, "Lobbying", direction = "climb")
# NOTE(review): rows 9:139 are a hard-coded slice of the edge list; confirm
# the range still matches the data whenever the spreadsheet changes.
treemap(tree_edges[9:139, ],
        index = c("from", "to"), vSize = "Lobbying", vColor = "Lobbying", type = "value",
        # heat.colors() returns an empty vector for n <= 0, so heat.colors(-55)
        # supplied no colours; a reversed 55-step ramp is the evident intent.
        palette = rev(heat.colors(55)))
%%R -w 12 -h 12 --units in -r 100
library(igraph)
# Undirected graph of the tree carrying aggLobbying as a vertex attribute.
# FALSE is spelled out because F is an ordinary, reassignable variable.
treeGraph <- as.igraph(tree, "aggLobbying", directed = FALSE, direction = "climb")
# Zero the first vertex so it does not dominate the size scale
# (presumably the root, which holds the grand total -- confirm).
vertex_attr(treeGraph, "aggLobbying")[1] <- 0
agg_lobbying <- vertex_attr(treeGraph, "aggLobbying")
# layout_with_kk is the current name of the Kamada-Kawai layout; the dotted
# layout.kamada.kawai alias is deprecated in recent igraph releases.
plot(treeGraph, vertex.color = "lightblue", vertex.label.family = "Segoe UI",
     vertex.label.cex = 0.55, vertex.label.color = "black", layout = layout_with_kk,
     # Vertex size is proportional to aggLobbying relative to the largest value.
     vertex.size = agg_lobbying / (max(agg_lobbying) * 0.035))
# Zip-code table restricted to Washington, reduced to the first row per city
# and indexed by city name so it can be joined on the lobbying "City" column.
longlat = (
    pd.read_csv("zip_codes_states.csv")
    .loc[lambda zips: zips["state"] == "WA"]
    .drop_duplicates(subset="city")
    .set_index("city")
)
longlat.head()
# City-level lobbying records joined (left join on "City") to the coordinate
# table, keeping only the columns needed for mapping.
lobbyingGeo = (
    lobbying[lobbying["entity_type"] == "CITIES"]
    .join(longlat, how="left", on="City")[
        ["Total-Lobbying", "City", "county", "longitude", "latitude"]]
    .rename(columns={"Total-Lobbying": "lobbying", "City": "city"})
)
lobbyingGeo = lobbyingGeo.drop(lobbyingGeo.index[44])  # duplicate row
# Rows the join could not fully populate. `axis` is passed by keyword because
# the positional form `.any(1)` was removed in pandas 2.0.
lobbyingGeo[lobbyingGeo.isnull().any(axis=1)]
# Hand-entered county and coordinates, keyed by row label.
# data retrieved from http://citylatitudelongitude.com/
manual_locations = {
    0: ("King", 47.281954, -122.250388),
    6: ("King", 47.46917, -122.364291),
    8: ("King", 47.364791, -122.104563),
    12: ("Pierce", 47.23206, -122.351726),
    38: ("King", 47.44333, -122.298767),
    41: ("King", 47.756904, -122.342414),
}
for row_label, (county, latitude, longitude) in manual_locations.items():
    lobbyingGeo.loc[row_label, "county"] = county
    lobbyingGeo.loc[row_label, "latitude"] = latitude
    lobbyingGeo.loc[row_label, "longitude"] = longitude
lobbyingGeo.head()
# Check whether two rows share the same (longitude, latitude) pair.
coord_cols = ["longitude", "latitude"]
duplicates = bool(lobbyingGeo.duplicated(subset=coord_cols).any())
print("Any Duplicate coordinates present?:", duplicates)
# Every row involved in a coordinate clash. duplicated(keep=False) replaces the
# earlier float(Series)/isin lookup, which only worked for exactly one
# duplicated row.
lobbyingGeo[lobbyingGeo.duplicated(subset=coord_cols, keep=False)]
# Give row 11 its own coordinates. The original wrote the latitude to row 11
# but the longitude to row 41, overwriting the value entered for row 41 above.
# NOTE(review): row 11 is assumed to be the intended target of both values -- confirm.
lobbyingGeo.loc[11, "latitude"] = 47.322323
lobbyingGeo.loc[11, "longitude"] = -122.312622
# sort out seattle location
print("Any Duplicate coordinates present?:", bool(lobbyingGeo.duplicated(subset=coord_cols).any()))
%%R -i lobbyingGeo -w 6 -h 4 --units in -r 200
library(ggmap)
# Bounding box of the city coordinates; the map is centred on the midpoint.
lat <- range(lobbyingGeo$latitude)
lon <- range(lobbyingGeo$longitude)
plot <- get_map(location = c(mean(lon), mean(lat)), zoom = 6, source = "google", maptype = "terrain")
# 2-D density of city locations: contour lines plus filled, translucent bands.
ggmap(plot, extent = "device") +
  geom_density2d(data = lobbyingGeo, aes(x = longitude, y = latitude), size = 0.1, color = "black") +
  stat_density2d(data = lobbyingGeo,
                 aes(x = longitude, y = latitude, fill = ..level.., alpha = ..level..), size = 0.01,
                 bins = 16, geom = "polygon") +
  # guide = "none" hides the legend; guide = FALSE is deprecated in ggplot2.
  scale_fill_gradient(low = "green", high = "red", guide = "none") +
  scale_alpha(range = c(0, 0.3), guide = "none") +
  scale_x_continuous(limits = lon, expand = c(0.4, 0)) +
  scale_y_continuous(limits = lat, expand = c(0.4, 0))
# talk about using average measures to set map location
%%R -i lobbyingGeo -w 5 -h 5 --units in -r 200
# Road map centred on the median city coordinate, one translucent point per
# row, sized by lobbying spend relative to the largest spender.
plot <- get_map(location = c(median(lobbyingGeo$longitude), median(lobbyingGeo$latitude)),
                zoom = 9, source = "google", maptype = "roadmap")
ggmap(plot, extent = "device") +
  geom_point(aes(x = longitude, y = latitude), data = lobbyingGeo, col = "darkred", alpha = 0.4,
             size = lobbyingGeo$lobbying / max(lobbyingGeo$lobbying) * 15) +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_alpha(range = c(0, 0.3), guide = "none")
# import matplotlib.pyplot as plt
# plt.rcParams['figure.figsize'] = (20,10)
# refinery.plot()
%%R -o all_states
# require(maps)
# State outline table from ggplot2's map_data(); the -o flag exports it to
# Python as `all_states`.
# NOTE(review): map_data("state") presumably needs the maps package installed -- confirm.
all_states <- map_data("state")
# Read the accident data without its last row. The explicit copy makes
# `refinery` an independent frame, so the column assignment below does not
# write into a slice of the parsed sheet.
refinery = pd.read_excel("19. Data Set - Module 6 - accidents-state.xlsx").iloc[:-1].copy()
refinery.head()
# Lower-case the state names before joining them to `all_states` on "region".
refinery["State"] = refinery["State"].str.lower()
nulls = all_states.join(refinery.set_index("State"), how="right", on="region")
# Regions with any missing value after the right join. `axis=1` is passed by
# keyword because the positional form `.any(1)` was removed in pandas 2.0.
nulls[nulls.isnull().any(axis=1)]
# Note: Guam, Hawaii and Alaska have low values, so they will be ignored.
# Puerto Rico and the Virgin Islands are very high, so they will be ignored here and explored separately.
# All rows are left in the frame to facilitate any potential calculations.
# Replace the spreadsheet headers with identifier-friendly column names.
short_names = {
    "# of RMP facilities": "facility_count",
    "# of accidents": "accidents",
    "# of deaths": "deaths",
    "# of injuries": "injuries",
    "# evacuated": "evacuated",
    "Property damage (dollars)": "property_damage",
}
refinery = refinery.rename(columns=short_names)
# Attach the per-state figures to the outline rows (right join on region)
# and drop the subregion column.
refineryGeo = (
    all_states.join(refinery.set_index("State"), how="right", on="region")
    .drop(columns="subregion")
)
refineryGeo.head()
%%R -i refineryGeo -w 7 -h 3 --units in -r 180
# Six state choropleths of the raw refinery figures on a shared base map.
lat <- range(all_states$lat)
# The column is named `long`; `all_states$lon` only resolved through partial
# matching, which fails silently (NULL) on tibbles.
lon <- range(all_states$long)
plot <- get_map(location = c(mean(lon), mean(lat)), zoom = 4, source = "stamen", maptype = "toner")

# Draw one choropleth over `basemap`, shading each state polygon of `data`
# by the column named `fill_col` on a thistle2 -> `high` gradient, no legend.
state_choropleth <- function(basemap, data, fill_col, high) {
  ggmap(basemap, extent = "device") +
    scale_x_continuous(limits = lon, expand = c(0, 0)) +
    scale_y_continuous(limits = lat, expand = c(0, 0)) +
    geom_polygon(data = data,
                 aes(x = long, y = lat, group = group, fill = .data[[fill_col]]),
                 colour = "black", size = 0.1, alpha = 0.4) +
    # guide = "none" replaces the deprecated guide = FALSE.
    scale_fill_continuous(low = "thistle2", high = high, guide = "none")
}

facility_count <- state_choropleth(plot, refineryGeo, "facility_count", "darkblue")
damage <- state_choropleth(plot, refineryGeo, "property_damage", "chocolate")
injuries <- state_choropleth(plot, refineryGeo, "injuries", "darkred")
accidents <- state_choropleth(plot, refineryGeo, "accidents", "darkblue")
evacuated <- state_choropleth(plot, refineryGeo, "evacuated", "chocolate")
deaths <- state_choropleth(plot, refineryGeo, "deaths", "darkred")
plot_grid(facility_count, damage, deaths, accidents, evacuated, injuries, nrow = 2, ncol = 3,
          labels = c("Facilities", "Damage", "Deaths", "Accidents", "Evacuated", "Injuries"))
# Remove the regions that had missing values after the join with `all_states`.
# `axis=1` is passed by keyword: the positional `.any(1)` was removed in pandas 2.0.
unmatched_regions = nulls.loc[nulls.isnull().any(axis=1), "region"]
refinery = refinery.set_index("State").drop(index=list(unmatched_regions)).reset_index()
# Per-facility rates, computed column-wise instead of row by row. The insertion
# order is kept because later cells select these columns by position.
per_facility = {
    "scaled_accidents": "accidents",
    "scaled_damage": "property_damage",
    "scaled_evacuated": "evacuated",
    "scaled_deaths": "deaths",
    "scaled_injuries": "injuries",
}
for rate_name, count_name in per_facility.items():
    refinery[rate_name] = refinery[count_name] / refinery["facility_count"]
refinery
from sklearn.preprocessing import MinMaxScaler

# Columns are selected by position: 0-6 are kept as they are, 7-11 are min-max
# scaled (presumably the five per-facility rate columns -- confirm the layout).
scaled_names = ["scaled_accidents", "scaled_damage", "scaled_evacuated",
                "scaled_deaths", "scaled_injuries"]
scaler = MinMaxScaler()
unscaled_part = refinery.iloc[:, list(range(7))]
scaled_part = pd.DataFrame(
    scaler.fit_transform(refinery.iloc[:, list(range(7, 12))]),
    columns=scaled_names,
)
refinery_scaled = pd.concat([unscaled_part, scaled_part], axis=1)
# Attach the scaled figures to the state outlines for plotting in R.
refineryGeo_scaled = (
    all_states.join(refinery_scaled.set_index("State"), how="right", on="region")
    .drop(columns="subregion")
)
refineryGeo_scaled.head()
%%R -i refineryGeo_scaled -w 7 -h 3 --units in -r 180
# Facility counts plus the five min-max scaled per-facility rates.
lat <- range(all_states$lat)
# The column is named `long`; `all_states$lon` only resolved through partial matching.
lon <- range(all_states$long)
plot <- get_map(location = c(mean(lon), mean(lat)), zoom = 4, source = "stamen", maptype = "toner")

# Draw one choropleth of `fill_col` on a thistle2 -> `high` gradient, no legend.
# `limits` goes straight to the fill scale: the earlier lims(fill = c(0, 1))
# was replaced by the scale_fill_continuous() added after it, so the shared
# 0-1 range never took effect. squish clamps values that stray marginally
# outside the limits instead of turning them into NA.
scaled_choropleth <- function(fill_col, high, limits = c(0, 1)) {
  ggmap(plot, extent = "device") +
    scale_x_continuous(limits = lon, expand = c(0, 0)) +
    scale_y_continuous(limits = lat, expand = c(0, 0)) +
    geom_polygon(data = refineryGeo_scaled,
                 aes(x = long, y = lat, group = group, fill = .data[[fill_col]]),
                 colour = "black", size = 0.1, alpha = 0.4) +
    scale_fill_continuous(low = "thistle2", high = high, limits = limits,
                          oob = scales::squish, guide = "none")
}

# facility_count is a raw count, so it keeps its own data-driven range.
facility_count <- scaled_choropleth("facility_count", "darkblue", limits = NULL)
damage <- scaled_choropleth("scaled_damage", "chocolate")
injuries <- scaled_choropleth("scaled_injuries", "darkred")
accidents <- scaled_choropleth("scaled_accidents", "darkblue")
evacuated <- scaled_choropleth("scaled_evacuated", "chocolate")
deaths <- scaled_choropleth("scaled_deaths", "darkred")
plot_grid(facility_count, damage, deaths, accidents, evacuated, injuries, nrow = 2, ncol = 3,
          labels = c("Facilities", "Damage", "Deaths", "Accidents", "Evacuated", "Injuries"))
%%R -i refineryGeo_scaled -w 2 -h 3 --units in -r 180
# Two composite maps, each shading states by the mean of two scaled rates.
lat <- range(all_states$lat)
# The column is named `long`; `all_states$lon` only resolved through partial matching.
lon <- range(all_states$long)
plot <- get_map(location = c(mean(lon), mean(lat)), zoom = 4, source = "stamen", maptype = "toner")

# Choropleth of the average of columns `col_a` and `col_b`, titled `title`.
# The 0-1 limits are set on the fill scale itself: the earlier
# lims(fill = c(0, 1)) was replaced by the scale_fill_continuous() added
# after it, so both panels were silently rescaled to their own data range.
paired_choropleth <- function(col_a, col_b, high, title) {
  ggmap(plot, extent = "device") +
    scale_x_continuous(limits = lon, expand = c(0, 0)) +
    scale_y_continuous(limits = lat, expand = c(0, 0)) +
    geom_polygon(data = refineryGeo_scaled,
                 aes(x = long, y = lat, group = group,
                     fill = (.data[[col_a]] + .data[[col_b]]) / 2),
                 colour = "black", size = 0.1, alpha = 0.4) +
    scale_fill_continuous(low = "thistle2", high = high, limits = c(0, 1),
                          oob = scales::squish) +
    theme(plot.title = element_text(size = 10), legend.position = "none") +
    ggtitle(title)
}

collateral <- paired_choropleth("scaled_damage", "scaled_evacuated", "chocolate", "Collateral Damage")
human_cost <- paired_choropleth("scaled_injuries", "scaled_deaths", "darkred", "Human Cost")
plot_grid(collateral, human_cost, nrow = 2,
          labels = NULL)
%%R -i refineryGeo_scaled -w 7 -h 3 --units in -r 180
# Experimental density overlay drawn over the state outline data.
lat <- range(all_states$lat)
# The column is named `long`; `all_states$lon` only resolved through partial matching.
lon <- range(all_states$long)
plot <- get_map(location = c(mean(lon), mean(lat)), zoom = 4, source = "stamen", maptype = "toner")
# NOTE(review): the density below is estimated from the polygon vertices
# (long/lat of refineryGeo_scaled) with no weighting by any damage column.
collateral <- ggmap(plot, extent = "device") +
  scale_x_continuous(limits = lon, expand = c(0, 0)) +
  scale_y_continuous(limits = lat, expand = c(0, 0)) +
#     geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_damage+scaled_evacuated)/2), colour='black', size=0.1, alpha=0.4) +
#     lims(fill=c(0, 1)) +
#     scale_fill_continuous(low='thistle2', high='chocolate') +
  theme(plot.title = element_text(size = 10), legend.position = "none") +
  ggtitle("Collateral Damage") +
  geom_density2d(data = refineryGeo_scaled, aes(x = long, y = lat), size = 0.1, color = "black") +
  stat_density2d(data = refineryGeo_scaled,
                 aes(x = long, y = lat, fill = ..level.., alpha = ..level..), size = 0.01,
                 bins = 16, geom = "polygon") +
  # guide = "none" replaces the deprecated guide = FALSE.
  scale_fill_gradient(low = "green", high = "red", guide = "none") +
  scale_alpha(range = c(0, 0.3), guide = "none")
collateral
# human_cost <- ggmap(plot, extent='device') +
# scale_x_continuous(limits=c(min(lon), max(lon)), expand = c(0, 0)) +
# scale_y_continuous(limits=c(min(lat), max(lat)), expand = c(0, 0)) +
# geom_polygon(data=refineryGeo_scaled, aes(x=long, y=lat, group=group, fill=(scaled_injuries+scaled_deaths)/2), colour='black', size=0.1, alpha=0.4) +
# lims(fill=c(0, 1)) +
# scale_fill_continuous(low='thistle2', high='darkred') +
# theme(plot.title=element_text(size=10), legend.position='none') +
# ggtitle('Human Cost')
# plot_grid(collateral, human_cost, nrow=2,
# labels=NULL)
import bs4 as bs
import requests
def state_land_area():
    """Scrape one column of Wikipedia's table of U.S. states by area.

    Returns:
        list[str]: the raw text of the seventh ``<td>`` of every table row
        after the first two rows, in page order. Presumably this is the land
        area column -- confirm against the live page layout.

    Raises:
        requests.HTTPError: if the page request returns an error status.
        ValueError: if no ``wikitable sortable`` table is found on the page.
    """
    resp = requests.get(
        "https://en.wikipedia.org/wiki/List_of_U.S._states_and_territories_by_area",
        timeout=30,  # do not let a stalled connection hang the notebook
    )
    resp.raise_for_status()
    soup = bs.BeautifulSoup(resp.text, "lxml")
    table = soup.find("table", {"class": "wikitable sortable"})
    if table is None:
        raise ValueError("area table not found on the Wikipedia page")
    landmass = []
    # The first two rows are skipped (presumably header rows).
    for row in table.find_all("tr")[2:]:
        cells = row.find_all("td")
        # Rows with fewer than seven cells would raise IndexError; skip them.
        if len(cells) > 6:
            landmass.append(cells[6].text)
    return landmass
# Remove the regions that had missing values after the join with `all_states`.
# errors="ignore" keeps this cell runnable after the earlier cell has already
# dropped the same labels; without it drop() raises KeyError for them.
unmatched_regions = nulls.loc[nulls.isnull().any(axis=1), "region"]
refinery = (
    refinery.set_index("State")
    .drop(index=list(unmatched_regions), errors="ignore")
    .reset_index()
)
# NOTE(review): despite the "land_scaled" names these columns are still divided
# by facility_count; the state_land_area() result below is not used yet.
per_facility = {
    "land_scaled_accidents": "accidents",
    "land_scaled_damage": "property_damage",
    "land_scaled_evacuated": "evacuated",
    "land_scaled_deaths": "deaths",
    "land_scaled_injuries": "injuries",
}
for rate_name, count_name in per_facility.items():
    refinery[rate_name] = refinery[count_name] / refinery["facility_count"]
refinery
state_land_area()
# Rebuild the min-max scaled frame: positions 0-6 are kept unchanged and
# positions 7-11 are scaled.
# NOTE(review): positions 7-11 are presumably still the scaled_* columns, not
# the land_scaled_* ones added afterwards -- confirm which set is intended.
scaled_names = ["scaled_accidents", "scaled_damage", "scaled_evacuated",
                "scaled_deaths", "scaled_injuries"]
scaler = MinMaxScaler()
unscaled_part = refinery.iloc[:, list(range(7))]
scaled_part = pd.DataFrame(
    scaler.fit_transform(refinery.iloc[:, list(range(7, 12))]),
    columns=scaled_names,
)
refinery_scaled = pd.concat([unscaled_part, scaled_part], axis=1)
# Attach the scaled figures to the state outlines for plotting in R.
refineryGeo_scaled = (
    all_states.join(refinery_scaled.set_index("State"), how="right", on="region")
    .drop(columns="subregion")
)
refineryGeo_scaled.head()